library(foreign)                        #read.spss (not used)
library(memisc)                         #spss.portable.file
library(plyr)                           #ldply
library(stringr)                        #str_sub, str_pad
library(dplyr)
library(xtable)
library(stm)

# Root directory of the replication archive. All paths below are relative to
# it. Left empty by default, in which case the current working directory is
# used unchanged.
the_prefix <- ""

# setwd("") raises "cannot change working directory", so only switch
# directories when a prefix has actually been supplied.
if (nzchar(the_prefix)) {
    setwd(the_prefix)
}

# Load flag: any value > 0 loads the survey data from disk; the flag is bumped
# after a successful load.
# NOTE(review): the .RData file presumably defines `sdat`, which is used
# below without being created in this script -- confirm.
load_it <- 1
if (load_it > 0) {
    load(file = "./data/kff_main_replication_data_with_psraid.RData")
    load_it <- load_it + 1
}

doc_to_tdm <- function (out, binary = TRUE) {
    #### adapted from R package 'parrot' github.com/wilryh/parrot
    #
    # Build a sparse document-by-term matrix from a list of documents.
    #
    # Args:
    #   out:    list with elements `documents` and `vocab`. Each element of
    #           `documents` is a two-row matrix: row 1 is used as the column
    #           (term) index and row 2 as the cell value (count).
    #   binary: if TRUE (default) every listed term is entered as 1;
    #           otherwise the values from row 2 are used.
    #
    # Returns:
    #   A sparse matrix with one row per element of `out$documents` (in list
    #   order, row names taken from names(out$documents)) and one column per
    #   term (column names taken from out$vocab).
    #
    # Rows are indexed by list position rather than by parsing the document
    # names as numbers, so non-numeric or unordered names are handled and the
    # row labels always match the rows they are attached to.
    if (!requireNamespace("Matrix", quietly = TRUE)) {
        stop("Package \"Matrix\" needed for this function to work. Please install it.",
            call. = FALSE)
    }
    terms_by_doc <- lapply(out$documents, function(x) x[1, ])
    doc_idx <- rep(seq_along(terms_by_doc), lengths(terms_by_doc))
    term_idx <- unlist(terms_by_doc, use.names = FALSE)
    if (binary) {
        cell_values <- rep(1, length(term_idx))
    } else {
        cell_values <- unlist(lapply(out$documents, function(x) x[2, ]),
            use.names = FALSE)
    }
    # Fix the dimensions up front so the result always has one row per
    # document and enough columns for the vocabulary, even when trailing
    # documents or terms have no entries.
    n_terms <- max(length(out$vocab), term_idx, 0)
    tdm <- Matrix::sparseMatrix(
        i = doc_idx,
        j = term_idx,
        x = cell_values,
        dims = c(length(out$documents), n_terms)
    )
    colnames(tdm) <- out$vocab
    rownames(tdm) <- names(out$documents)
    tdm
}



# Derive a single insurance-type label per respondent. Everyone starts as
# "Other"; the rules below are applied in order, so a later rule overrides an
# earlier one for respondents who satisfy both. Rows where an indicator is
# missing are left untouched by that rule.
sdat$INSURANCETYPE <- "Other"
sdat$INSURANCETYPE <- local({
    label <- sdat$INSURANCETYPE
    label[which(sdat$MEDICAID == 1)] <- "Medicaid"
    label[which(sdat$MEDICARESR == 1)] <- "Medicare"
    label[which(sdat$EMPLINSURE == 1)] <- "Employer"
    label[which(sdat$SELFINSURE == 1)] <- "Purchased"
    label[which(sdat$COVERED == 0)] <- "None"
    label
})

# Mean of `x` with missing values removed before averaging.
# Uses the literal TRUE rather than `T`, which is an ordinary variable that
# can be reassigned elsewhere in a session.
mean.na <- function(x) {
    mean(x, na.rm = TRUE)
}

# Flag for merging the helped/hurt coding into `sdat`: any value > 0 runs the
# merge; the flag is bumped afterwards. The guard tests this block's own flag
# (load_it2) rather than the data-loading flag used earlier in the script.
# NOTE(review): the sourced script presumably creates `kff_helped_hurt`,
# which is not defined in this file -- confirm.
load_it2 <- 1
if (load_it2 > 0) {
    source("./code/health_survey_load_helped_hurt.R")
    load_it2 <- load_it2 + 1
    # Left join: keep every row of sdat, attaching the coding where present.
    sdat <- merge(sdat, kff_helped_hurt, by = c("PSRAID", "NUMBER"),
                  all.x = TRUE)
    # Recode the "1" markers to logicals (NA stays NA for unmatched rows).
    sdat$helped <- sdat$helped == "1"
    sdat$hurt <- sdat$hurt == "1"
}


#### Table A3: keywords in "hurt" responses from uninsured respondents

# Rows with COVERED == 0 that are flagged as hurt. The same rows supply both
# the response text (hurt_vb) and the document-level metadata.
uninsured_hurt <- subset(sdat, COVERED == 0 & hurt)

processed <- textProcessor(
    uninsured_hurt$hurt_vb,
    uninsured_hurt,
    stem = FALSE
)

out <- prepDocuments(
    processed$documents,
    processed$vocab,
    processed$meta,
    lower.thresh = 0
)

# Binary document-by-word matrix (rows = documents, columns = vocabulary).
tdm <- doc_to_tdm(out)


#### mutual information calculation
# Split documents at 2014-01-01: "p" = on/after the cutoff, "s" = before it.
is_post <- out$meta$DATE.y >= "2014-01-01"
is_pre <- out$meta$DATE.y < "2014-01-01"
np <- sum(is_post)
ns <- sum(is_pre)
D <- np + ns
m <- Matrix::t(tdm) # tdm is row-document column-word, m is transpose of that

# drop = FALSE keeps a matrix even when a period holds a single document.
m_post <- m[, is_post, drop = FALSE]
m_pre <- m[, is_pre, drop = FALSE]

# Per-word document counts: nj* = documents containing the word, nnotj* =
# documents not containing it. Entries of m are non-negative, so the
# "absent" count is the number of documents minus the "present" count.
nj <- Matrix::rowSums(m > 0)
nnotj <- ncol(m) - nj
njp <- Matrix::rowSums(m_post > 0)
njs <- Matrix::rowSums(m_pre > 0)
nnotjp <- ncol(m_post) - njp
nnotjs <- ncol(m_pre) - njs

# Mutual information (base 2) between word presence and period, summed over
# the four (present/absent) x (post/pre) cells.
# NOTE(review): a word with an empty cell (e.g. njp == 0) contributes
# 0 * log(0) = NaN, and sort() drops NaN by default, so such words can never
# appear in the tables below -- confirm this is intended.
mi <- njp/D*log((njp*D)/(np*nj),2)+ njs/D*log((njs*D)/(nj*ns),2) +
          nnotjp/D*log((nnotjp*D)/(np*nnotj),2) +
                        nnotjs/D*log((nnotjs*D)/(nnotj*ns),2)
names(mi) <- colnames(tdm)   # add vocab labels to mi vector

# Direction of association: is the word relatively more frequent in
# post-cutoff documents or in pre-cutoff documents?
post_leaning <- njp / np - njs / ns > 0
pre_leaning <- njp / np - njs / ns < 0

sort(mi[post_leaning], decreasing = TRUE)[1:25] # top 25 post-cutoff words
sort(mi[pre_leaning], decreasing = TRUE)[1:25]  # top 25 pre-cutoff words

# [1:25] pads with NA when fewer than 25 words qualify, which keeps both
# columns of the table the same length.
uninsured_post <- names(sort(mi[post_leaning], decreasing = TRUE)[1:25])
uninsured_pre <- names(sort(mi[pre_leaning], decreasing = TRUE)[1:25])

print(
    xtable(data.frame(uninsured_pre, uninsured_post)),
    include.rownames = FALSE,
    file = "./figs/tableA3_uninsured_keywords.tex"
)




#### Table A4: keywords in "helped" responses from self-insured respondents

# Rows with SELFINSURE == 1 that are flagged as helped. The same rows supply
# both the response text (helped_vb) and the document-level metadata.
selfinsured_helped <- subset(sdat, SELFINSURE == 1 & helped)

processed <- textProcessor(
    selfinsured_helped$helped_vb,
    selfinsured_helped,
    stem = FALSE
)

out <- prepDocuments(
    processed$documents,
    processed$vocab,
    processed$meta,
    lower.thresh = 0
)

# Binary document-by-word matrix (rows = documents, columns = vocabulary).
tdm <- doc_to_tdm(out)


#### mutual information calculation
# Split documents at 2014-01-01: "p" = on/after the cutoff, "s" = before it.
is_post <- out$meta$DATE.y >= "2014-01-01"
is_pre <- out$meta$DATE.y < "2014-01-01"
np <- sum(is_post)
ns <- sum(is_pre)
D <- np + ns
m <- Matrix::t(tdm) # tdm is row-document column-word, m is transpose of that

# drop = FALSE keeps a matrix even when a period holds a single document.
m_post <- m[, is_post, drop = FALSE]
m_pre <- m[, is_pre, drop = FALSE]

# Per-word document counts: nj* = documents containing the word, nnotj* =
# documents not containing it. Entries of m are non-negative, so the
# "absent" count is the number of documents minus the "present" count.
nj <- Matrix::rowSums(m > 0)
nnotj <- ncol(m) - nj
njp <- Matrix::rowSums(m_post > 0)
njs <- Matrix::rowSums(m_pre > 0)
nnotjp <- ncol(m_post) - njp
nnotjs <- ncol(m_pre) - njs

# Mutual information (base 2) between word presence and period, summed over
# the four (present/absent) x (post/pre) cells.
# NOTE(review): a word with an empty cell (e.g. njp == 0) contributes
# 0 * log(0) = NaN, and sort() drops NaN by default, so such words can never
# appear in the tables below -- confirm this is intended.
mi <- njp/D*log((njp*D)/(np*nj),2)+ njs/D*log((njs*D)/(nj*ns),2) +
          nnotjp/D*log((nnotjp*D)/(np*nnotj),2) +
                        nnotjs/D*log((nnotjs*D)/(nnotj*ns),2)
names(mi) <- colnames(tdm)   # add vocab labels to mi vector

# Direction of association: is the word relatively more frequent in
# post-cutoff documents or in pre-cutoff documents?
post_leaning <- njp / np - njs / ns > 0
pre_leaning <- njp / np - njs / ns < 0

sort(mi[post_leaning], decreasing = TRUE)[1:25] # top 25 post-cutoff words
sort(mi[pre_leaning], decreasing = TRUE)[1:25]  # top 25 pre-cutoff words

# [1:25] pads with NA when fewer than 25 words qualify, which keeps both
# columns of the table the same length.
selfinsured_post <- names(sort(mi[post_leaning], decreasing = TRUE)[1:25])
selfinsured_pre <- names(sort(mi[pre_leaning], decreasing = TRUE)[1:25])

print(
    xtable(data.frame(selfinsured_pre, selfinsured_post)),
    include.rownames = FALSE,
    file = "./figs/tableA4_selfinsured_keywords.tex"
)
